---
title: "Checking Temporal and Spatial Autocorrelation"
author: "Kerice Doten-Snitker"
date: "`r Sys.Date()`"
output: html_document
---

```{r setup, echo = FALSE, cache = FALSE}
knitr::opts_chunk$set(echo = TRUE)
opts_chunk$set(dev = c('png', 'pdf'), 
        fig.align = 'center', fig.height = 4, fig.width = 5, 
        fig.path=here::here("Diffusion", "out_figures/", "Autocorrelation/")) 
```
```{r load_pander_methods, echo = FALSE}
require(pander)
panderOptions('round', 2)
panderOptions('digits',7)
panderOptions('keep.trailing.zeros', TRUE)
panderOptions('table.style', 'multiline')
#panderOptions('table.split.table', Inf)
```
```{r load_ggplot_options, echo=FALSE}
mytheme <- theme_bw() + 
  theme(legend.position="right", legend.direction="vertical", 
        legend.background = element_rect(fill="transparent", colour=NA),
        legend.key = element_rect(fill="transparent", colour=NA),
        legend.key.size=unit(.33,"cm"),
        legend.title=element_text(size=12),
        legend.text=element_text(size=12),
        panel.grid.minor = element_blank(), 
        panel.grid.major = element_blank(),
        panel.background = element_blank(),
        panel.border = element_rect(color = "black"),
        text=element_text(family="sans"),
        plot.title=element_text(size=16),
        plot.subtitle=element_text(size=14),
        axis.text=element_text(size=12,face="plain"),
        axis.title=element_text(size=12,face="plain"))
```
 
## Background

explanation of why looking at autocorrelation is important. I don't expect autocorrelation for my dependent variable, expulsion, for two reasons: 
1) Places with expulsion in one period drop out of the sample, unless they readmit Jews. 
2) Expulsion was uncommon, so just based on the numbers, bivariate correlation with anything is unlikely. 

```{r add_data, echo=FALSE}
# p_load(spatstat, ape, raster, spdep, sf)

# pull a df of the city coordinates, which are not in the cleaned model data
cities <- mydata_geo[c("PlaceID","XCOORD","YCOORD")] %>% # as dataframe
  filter(PlaceID %in% model_data$PlaceID) %>%
  distinct

```

## Spatial Autocorrelation

First order effects are those that are driven by underlying variation in the properties of a space. Second order effects look at spatial distributions of data as some interaction process with the spatial distribution, affected not just by the space but by the distribution of a phenomenon within that space. To understand whether expulsions were spatially correlated, I need to establish a) the autocorrelation of city locations, b) the autocorrelation of expulsions, c) the autocorrelation of non-expulsions.

1. Average Nearest Neighbor (ANN)
important point: this assumes there is no other spatial trend in the distribution of points - i.e., there is no temporal process that creates spatial drift

2. Monte Carlo test
If I simulate 1000 random distributions of points, where does the observed ANN fit with these similuations?

Do random assignment of expulsions to cities and then compare ANN!

3. Moran's I using polygons
If I make polygons out of all the points, is expulsion distributed among neighbors?


### Monte Carlo Average Nearest Neighbor  
  
```{r spatial_autocorr_setup, echo=FALSE, eval=TRUE}

model_data_poolall <- model_data %>%
  dplyr::select(Year, PlaceID, Vrtrb) %>%
  group_by(PlaceID) %>%
  summarize(Expl = ifelse(sum(Vrtrb)>0,1,0))
  
sims <- replicate(1000, {
  sample(model_data_poolall$Expl)
  })

colnames(sims) <- colnames(sims, do.NULL=FALSE, prefix = "sim")

model_data_poolall <- model_data_poolall %>%
  merge(cities, by="PlaceID", all.x=TRUE, all.y=FALSE) %>% #keep all obs in the model data only
  cbind(sims)

# give it a buffer in the range of the box by +/-500m
model_data_poolall_ppp <- ppp(x=model_data_poolall$XCOORD, 
                              y=model_data_poolall$YCOORD,
                              xrange=c(min(model_data_poolall$XCOORD)-500, 
                                       max(model_data_poolall$XCOORD)+500),
                              yrange=c(min(model_data_poolall$YCOORD)-500, 
                                       max(model_data_poolall$YCOORD)+500),
                              marks=model_data_poolall[,c(1:2,5:1004)])

```

The most important thing is to check whether the observed distribution of distances to neighbors is different from simulated distributions of distances to neighbors (grouped by whether expelling or not). Are the observed spatial distributions different from random assignment?

```{r spatial_autocorr_ANN, fig.height = 4, fig.width = 6, echo=FALSE, eval=TRUE}
# for observed distribution of expulsions
ANN <- apply(nndist(model_data_poolall_ppp, k=1:90, # 90 expulsion locations
                    by=as.factor(model_data_poolall$Expl)), 2, FUN=mean)
# plot(ANN[str_starts(names(ANN),"1")], type="b", main=NULL, las=1, col="blue")
# points(ANN[str_starts(names(ANN),"0")], type="b", las=1, col="red")

ANN.obs <- ANN %>%
  tibble::enframe() %>%
  mutate(Distance = value,
         Condition = if_else(str_detect(name, "[0]\\."),"No","Yes"),
         KNN = as.numeric(str_extract(name, "(?<=\\.)[0-9]+")),
         Simulation = NA) %>%
  dplyr::select(Simulation, Condition, KNN, Distance)

# for simulated distributions of expulsions -------
ANN.array <- array(numeric(),c(2,90,0))
for (n in 5:1004) {
  ANN <- apply(nndist(model_data_poolall_ppp, k=1:90, # 90 expulsions
                    by=as.factor(model_data_poolall[,n])), 2, FUN=mean) 
  ANN.array <- abind(ANN.array, t(array(ANN,dim = c(90,2))))
}

dimnames(ANN.array) <- list(Condition=c("No", "Yes"), KNN=1:90, Simulation=1:1000)

ANN.tbl <- ANN.array %>%
  as.tbl_cube(met_name = "Distance") %>%
  as_tibble %>%
  rbind(., ANN.obs) %>%
  mutate(DistanceKM = Distance/1000) # meters to kilometers

# visualize the comparison

ANNplot1 <- ggplot(data=ANN.tbl[!is.na(ANN.tbl$Simulation),], aes(x=KNN, y=DistanceKM, group=interaction(Simulation, Condition), color = Condition)) + 
  geom_line() +
  geom_line(aes(x=KNN, y=DistanceKM, group=Condition),
            ANN.tbl[is.na(ANN.tbl$Simulation),],
             col="black", size=2, linetype="dashed") +
  labs(title="", 
    x="K nearest neighbor", y="Average distance (km)") +
  mytheme +
  theme(plot.title=element_text(size=16),
        plot.subtitle=element_text(size=14),
        axis.text=element_text(size=12,face="plain"),
        axis.title=element_text(size=14,face="bold"))

plot(ANNplot1)
```

The observed and simulated distances to neighboring expulsions are increadibly consistent. There is some variation in the simulated distances to expelling neighbors, and the observed distribution generally remains within the bounds of the simulations. However, the observed distances for expellers are on the upper extreme for about the first 25 neighbors before switching to the lower extreme. That means that, among expellers, nearer neighbors are farther than might be expected, and farther neighbors are closer than expected. Plus, (expelling) neighbors are farther for expellers than (non-expelling) neighbors are for non-expellers. *I interpreted this wrong in the previous version*
  
```{r spatial_autocorr_ANN_split_1, fig.height = 4, fig.width = 6, eval = TRUE, echo=FALSE}
# I think this should be the same as above, but with different code
# for multiple distributions of expulsions (i.e., the simulations)
# now, segmented to within-condition distances ---------------------------------

ANN.array_strat <- array(numeric(),c(2,89,0))
for (n in 3:1002) {
  # subset to all 1's in column n and all 0's
  data_sub_yes <- model_data_poolall_ppp[model_data_poolall_ppp[["marks"]][,n]==1] 
  data_sub_no <- model_data_poolall_ppp[model_data_poolall_ppp[["marks"]][,n]==0] 
  ANN_yes <- apply(nndist(data_sub_yes, k=1:89), # 89 other expulsions
                    2, FUN=mean) # mean by column (sim)
  ANN_no <- apply(nndist(data_sub_no, k=1:89),
                    2, FUN=mean)
  ANN.array_strat <- abind(ANN.array_strat, t(array(c(ANN_no,ANN_yes),dim = c(89,2))))
}

dimnames(ANN.array_strat) <- list(Condition=c("No", "Yes"), KNN=1:89, Simulation=1:1000)

ANN.strat.tbl <- ANN.array_strat %>%
  as.tbl_cube(met_name = "Distance") %>%
  as_tibble %>%
  mutate(DistanceKM = Distance/1000) # meters to kilometers

# ANNplot2 <- ggplot(data=ANN.strat.tbl, aes(x=KNN, y=DistanceKM, group=interaction(Simulation, Condition), color = Condition)) + 
#   geom_line() +
#   #geom_smooth(aes(group = 1), size = 2, method = "lm", se = FALSE)
#   labs(title="", 
#     x="K nearest neighbor", y="Average distance (km)")
# 
# plot(ANNplot2)
```


#### Comparing the observed and simulated distributions of first nearest neighbors

```{r spatial_autocorr_ANN_split_2, fig.height = 4, fig.width = 8, echo=FALSE}
# comparison to the observed distribution -------------------------

ANN.obs.yes <- apply(nndist(
  model_data_poolall_ppp[model_data_poolall_ppp[["marks"]]["Expl"]==1], 
                    k=1:89), 2, FUN=mean)
# 89 because if there are 90 expulsion locations total, each one has 89 alters
ANN.obs.no <- apply(nndist(
  model_data_poolall_ppp[model_data_poolall_ppp[["marks"]]["Expl"]==0], 
                    k=1:89), 2, FUN=mean)

# using base graphics for individual plots
# hist(ANN.strat.tbl$Distance[ANN.strat.tbl$Condition=="Yes"&ANN.strat.tbl$KNN==1], main=NULL, las=1, breaks=20, col="bisque")
# abline(v=ANN.obs.yes[1], col="blue")
# 
# hist(ANN.strat.tbl$Distance[ANN.strat.tbl$Condition=="No"&ANN.strat.tbl$KNN==1], main=NULL, las=1, breaks=20, col="bisque")
# abline(v=ANN.obs.no[1], col="blue")

ANN.obs.strat <- ANN.obs.no %>%
  cbind(ANN.obs.yes) %>%
  as.data.frame() %>% # must be df to gather, and easier for naming
  setNames(., c("No", "Yes")) %>%
  mutate(KNN=1:n()) %>% # add as index of which neighbor
  gather(Condition, Distance, -KNN) %>% # keep the KNN value
  mutate(DistanceKM = Distance/1000) # meters to kilometers

# first, a function to make the number of axis breaks consistent across period facets - from https://stackoverflow.com/questions/28436855/change-the-number-of-breaks-using-facet-grid-in-ggplot2
equal_breaks <- function(n = 3, s = 0.05, ...){
  function(x){
    # rescaling
    d <- s * diff(range(x)) / (1+2*s)
    round(seq(min(x)+d, max(x)-d, length=n),1)
  }
}

# plot on the same plot, two panels
# Monte Carlo Simulations of Average Nearest Neighbor, with observed averages
ANNplot3 <- ggplot(data=ANN.strat.tbl[ANN.strat.tbl$KNN==1,], aes(DistanceKM)) + 
  geom_histogram(bins=20, color="black", fill=NA) +
  geom_vline(aes(xintercept=DistanceKM), ANN.obs.strat[ANN.obs.strat$KNN==1,], 
             col="black", size=1, linetype="dashed") +
  scale_y_continuous(expand=c(0,0), limits=c(0,150), breaks=c(seq(25,150,25))) +
  facet_wrap(~Condition, scales = 'free_x', 
             labeller = as_labeller(c(No = "Among non-expelling cities",
                                      Yes = "Among expelling cities"))) +
  scale_x_continuous(breaks=equal_breaks(n=3,s=0.5), 
                     expand = c(0.05, 0)) +
  labs(title="", 
    x="Distance to nearest neighbor (km)", y="Frequency of Simulations") +
  mytheme  +
  theme(plot.title=element_text(size=16),
        plot.subtitle=element_text(size=14),
        axis.text=element_text(size=12,face="plain"),
        axis.title=element_text(size=14,face="bold"),
        strip.text.x=element_text(size=12,face="bold"))

plot(ANNplot3)

# handy reference values
observed_n1_yes_km <- round(ANN.obs.strat$DistanceKM[ANN.obs.strat$KNN==1&ANN.obs.strat$Condition=="Yes"], 2)
sim_n1_yes_km <- round(ANN.strat.tbl$DistanceKM[ANN.strat.tbl$KNN==1&ANN.strat.tbl$Condition=="Yes"], 2)
observed_n1_no_km <- round(ANN.obs.strat$DistanceKM[ANN.obs.strat$KNN==1&ANN.obs.strat$Condition=="No"], 2)
sim_n1_no_km <- round(ANN.strat.tbl$DistanceKM[ANN.strat.tbl$KNN==1&ANN.strat.tbl$Condition=="No"], 2)
```

    
  The distribution of expulsions is *closer* together than the bulk of simulations, but where does it fit with the distrubtion of the simulations, as a percentile? The observed average nearest expelling neighbor is `r round(observed_n1_yes_km, 2)` kilometers away. In these simulations, the observed distance is below `r 100-(ecdf(sim_n1_yes_km)(observed_n1_yes_km)*100)`% of the simulated neighbor distances. This is evidence that the distribution of expulsions is non-random; the observed expulsions are closer together than random distribution would predict.
 
  The distribution of non-expulsions is *also closer* than the bulk of simulations, but where does it fit with the distrubtion of the simulations, as a percentile? The observed average nearest non-expelling neighbor is `r round(observed_n1_no_km, 2)` kilometers away. In these simulations, the observed distance is below `r 100-(ecdf(sim_n1_no_km)(observed_n1_no_km)*100)`% of the simulated neighbor distances. We cannot conclude that the distribution of non-expulsions is non-random.
  
  
## Now split by time periods, instead of all pooled together

The study covers 135 years. To split this into even and manageable time frames, I divided into 5 periods of 27 years. This is a little unwieldy, but they actually to match up rather well to potential trends within expulsions: pre-Constance, post-Constance, the mid-century lull but at the time of Nicholas of Cusa, the late-century uptick in political conflict and the rise of the princes, and the urban rebellions as well as revivals on the eve of the Reformation.

### Redo the Monte Carlo Average Nearest Neighbor by time slices  
  
```{r tempospatial_autocorr_setup, echo=FALSE}

model_data_periods <- model_data %>%
  mutate(Period27yrs = cut(model_data$Year, 5, dig.lab=4)) %>%
  dplyr::select(Year, PlaceID, Vrtrb, Period27yrs) %>%
  group_by(PlaceID, Period27yrs) %>%
  summarize(Expl = ifelse(sum(Vrtrb)>0,1,0)) %>%
  ungroup() %>%
  merge(cities, by="PlaceID", all.x=TRUE, all.y=FALSE) %>% #keep all obs in the model data only, need to nest this info in period-based tibbles
  group_by(Period27yrs) %>% # get ready to split on period!
  nest(.key="data") %>% # now 5 subsets, by 27 year spans
  mutate(sims = map(data,
                    ~data.frame(replicate(1000, {sample(.x$Expl)}))
                    )) # colnames are auto: X1:X1000

# make a list-column that includes both the observed and simulated
model_data_nested <- model_data_periods %>%
  mutate(nested = map2(data, sims,
                      ~cbind(.x,.y)))

# convert the items in the new list-column to ppp objects for doing KNN
# don't keep PlaceID - not vital, and interferes with nndist segmenting
model_data_nested_ppp <- model_data_nested %>%
  mutate(nested_ppp = map(nested, ~ppp(x=.x$XCOORD, 
                              y=.x$YCOORD,
                              xrange=c(min(.x$XCOORD-500), 
                                       max(.x$XCOORD+500)),
                              yrange=c(min(.x$YCOORD-500), 
                                       max(.x$YCOORD+500)),
                              marks=.x[,c(2,5:1004)])
                          )) %>%
  dplyr::select(-nested, -data, -sims)

```
 
   
```{r tempospatial_autocorr_ANN, fig.height = 7, fig.width = 8, echo=FALSE, eval=TRUE}

# make a new list-column for storing nearest neighbor calculations
model_data_nested_ppp$ANN <- vector("list", dim(model_data_nested_ppp)[1])

# loop through period ppp objects, subsetting based on simulated locations of expulsions + the observed locations
for (i in 1:dim(model_data_nested_ppp)[1]){
  # 1) create a df to store NN results for ppp object i
  ANN <- data.frame("nestcol"=NA,"ANN_no"=NA,"ANN_yes"=NA)
  # keeping col number will help tell observed (1) from sims (2:1001)
  for (j in 1:1001) { # columns are (Expl, Sims(1:1000))
    # subset each period ppp to all 1's in column j and all 0's
    data_sub_yes <- model_data_nested_ppp$nested_ppp[[i]][
                           model_data_nested_ppp$nested_ppp[[i]][["marks"]][,j]==1]
    data_sub_no <- model_data_nested_ppp$nested_ppp[[i]][
                           model_data_nested_ppp$nested_ppp[[i]][["marks"]][,j]==0] 
    # calculate first nearest neighbor for each, and mean of the subset
    ANN_yes <- mean(nndist(data_sub_yes, k=1))/1000 # to km!
    ANN_no <- mean(nndist(data_sub_no, k=1))/1000
    # store results from this (either observed or simulated) column
    ANN[j,] <- data.frame(cbind(j, ANN_no, ANN_yes)) 
  }
  model_data_nested_ppp$ANN[[i]] <- ANN
}

# make a df of just period info plus nearest neighbor results
ANN.periods.strat <- model_data_nested_ppp %>%
  # turn ANN into a "long" version
  mutate(ANN_plot = map(ANN,
                        ~.x%>%
                          gather(Condition, DistanceKM, -nestcol))) %>%
  dplyr::select(Period27yrs,ANN_plot) %>%
  # unnesting makes the structure one long, normal df again!
  unnest(ANN_plot)

# ANN.periods.strat$Period.f <- factor(ANN.periods.strat$Period27yrs, 
#                                         levels = c(1,2,3,4,5),
#                                         labels = c("1385-", "1412-",
#                                                    "1439-", "1466-",
#                                                    "1493-"))
#   already a factor

# Monte Carlo Simulations of Average Nearest Neighbor, with observed averages
# plot on the same plot - 5 periods x 2 conditions is a lot!
# for simplicity, just within expulsions + only if >1 expulsions in a period
# (some periods there are no "neighbors" because only 1 expulsion, and DistanceKM is Inf)

# the plot, for expelling cities only
ANNplot4 <- ggplot(data=ANN.periods.strat[ANN.periods.strat$nestcol!=1&ANN.periods.strat$Condition=="ANN_yes"&is.finite(ANN.periods.strat$DistanceKM),], 
                   aes(DistanceKM)) + 
  geom_histogram(bins=20, color="black", fill=NA) +
  geom_vline(aes(xintercept=DistanceKM),
             ANN.periods.strat[ANN.periods.strat$nestcol==1&ANN.periods.strat$Condition=="ANN_yes"&is.finite(ANN.periods.strat$DistanceKM),],
             col="black", size=1, linetype="dashed") +
  scale_y_continuous(expand=c(0,0), limits=c(0,175), breaks=c(seq(25,150,25))) +
  facet_wrap(~Period27yrs, scales = 'free_x') +
  # use 3 breaks, 
  # use same s as first expand argument, 
  # second expand argument should be 0
  scale_x_continuous(breaks=equal_breaks(n=3,s=0.5), 
                     expand = c(0.05, 0)) +
  labs(title="", 
    x="Distance to nearest neighbor (km)", y="Frequency of Simulations") +
  mytheme  +
  theme(plot.title=element_text(size=16),
        plot.subtitle=element_text(size=14),
        axis.text=element_text(size=12,face="plain"),
        axis.title=element_text(size=14,face="bold"),
        strip.text.x=element_text(size=12,face="bold"))

plot(ANNplot4)

```

```{r tempospatial_ANN_table, eval=TRUE, echo=FALSE}
# reference values
# for the simulations
sim_yes_km <- ANN.periods.strat %>%
  filter(nestcol>1, Condition=="ANN_yes") %>%
  dplyr::select(nestcol, Period27yrs, DistanceKM) %>% # need nestcol as id for spread
  spread(Period27yrs, DistanceKM, sep="_") %>%
  dplyr::select(-nestcol) # now drop it!
# for the observed values
observed_yes_km <- ANN.periods.strat %>%
  filter(nestcol==1, Condition=="ANN_yes") %>%
  dplyr::select(nestcol, Period27yrs, DistanceKM) %>% # need nestcol as id for spread
  spread(Period27yrs, DistanceKM, sep="_") %>%
  dplyr::select(-nestcol) # now drop it!
# a table of percentile values for the observed first ANN compared to sims, by period
ANN_table <- data.frame(
  "Period" = c("1385-1411", "1412-1438","1439-1465", "1466-1492","1493-1520"),
  "Observed"= c(as.numeric(observed_yes_km[,1]),as.numeric(observed_yes_km[,2]),as.numeric(observed_yes_km[,3]),as.numeric(observed_yes_km[,4]),as.numeric(observed_yes_km[,5])),
  "Percentile" = 
    c((ecdf(unlist(sim_yes_km[,1]))(unlist(observed_yes_km[1,1]))*100),
      (ecdf(unlist(sim_yes_km[,2]))(unlist(observed_yes_km[1,2]))*100),
      (ecdf(unlist(sim_yes_km[,3]))(unlist(observed_yes_km[1,3]))*100),
      (ecdf(unlist(sim_yes_km[,4]))(unlist(observed_yes_km[1,4]))*100),
      (ecdf(unlist(sim_yes_km[,5]))(unlist(observed_yes_km[1,5]))*100)))
```
```{r ANN_period_table, eval=TRUE, results='asis', echo=FALSE}

stargazer(ANN_table, type = "html", 
  summary = FALSE, align=TRUE)

asis_output("\\br")
```

For expelling cities, the observed values are not significantly different from the simulated values, at least in the p<0.05 sense. The observed average nearest expelling neighbor ranges from `r round(min(ANN_table$Observed), 2)` kilometers away to `r round(max(ANN_table$Observed), 2)` kilometers away. The observed distance is closer than the simulated neighbor distances in 1385-1411 and in 1439-1465, spot on in 1412-1438 and 1493-1520, and farther apart in 1466-1492. One thing to pay attention to is the actual distances. Simulated nearest neighbors for 1385-1411 are actually very far apart, with a range much higher than the other periods. Though the observed value is most different from the simulated ones, it is actually farther away than at any other period. This is not strong evidence that the distribution of expulsions is non-random, but it does leave open that the distribution might change over time. The shrink in the simulated distances, and the variation in difference between the observed and simulated distances is some small evidence of spatio-temporal relationships, but not of a direct relationship just between these three factors. I argue that other factors, which may also have spatial and temporal variation, are what provide the link. 

```{r tempospatial_ANN_no_table, fig.height = 7, fig.width = 8, eval=TRUE, echo=FALSE}
# the plot, for non-expelling cities only
ANNplot5 <- ggplot(data=ANN.periods.strat[ANN.periods.strat$nestcol!=1&ANN.periods.strat$Condition=="ANN_no"&is.finite(ANN.periods.strat$DistanceKM),], 
                   aes(DistanceKM)) + 
  geom_histogram(bins=20, color="black", fill=NA) +
  geom_vline(aes(xintercept=DistanceKM),
             ANN.periods.strat[ANN.periods.strat$nestcol==1&ANN.periods.strat$Condition=="ANN_no"&is.finite(ANN.periods.strat$DistanceKM),],
             col="black", size=1, linetype="dashed") +
  scale_y_continuous(expand=c(0,0), limits=c(0,175), breaks=c(seq(25,150,25))) +
  facet_wrap(~Period27yrs, scales = 'free_x') +
  # use 3 breaks, 
  # use same s as first expand argument, 
  # second expand argument should be 0
  scale_x_continuous(breaks=equal_breaks(n=3,s=0.5), 
                     expand = c(0.05, 0)) +
  labs(title="", 
    x="Distance to nearest neighbor (km)", y="Frequency of Simulations") +
  mytheme  +
  theme(plot.title=element_text(size=16),
        plot.subtitle=element_text(size=14),
        axis.text=element_text(size=12,face="plain"),
        axis.title=element_text(size=14,face="bold"),
        strip.text.x=element_text(size=12,face="bold"))

plot(ANNplot5)

# reference values
# for the simulations
sim_no_km <- ANN.periods.strat %>%
  filter(nestcol>1, Condition=="ANN_no") %>%
  dplyr::select(nestcol, Period27yrs, DistanceKM) %>% # need nestcol as id for spread
  spread(Period27yrs, DistanceKM, sep="_") %>%
  dplyr::select(-nestcol) # now drop it!
# for the observed values
observed_no_km <- ANN.periods.strat %>%
  filter(nestcol==1, Condition=="ANN_no") %>%
  dplyr::select(nestcol, Period27yrs, DistanceKM) %>% # need nestcol as id for spread
  spread(Period27yrs, DistanceKM, sep="_") %>%
  dplyr::select(-nestcol) # now drop it!
# a table of percentile values for the observed first ANN compared to sims, by period
ANN_no_table <- data.frame(
  "Period" = c("1385-1411", "1412-1438","1439-1465", "1466-1492","1493-1520"),
  "Observed"= c(as.numeric(observed_no_km[,1]),as.numeric(observed_no_km[,2]),as.numeric(observed_no_km[,3]),as.numeric(observed_no_km[,4]),as.numeric(observed_no_km[,5])),
  "Percentile" = 
    c((ecdf(unlist(sim_no_km[,1]))(unlist(observed_no_km[1,1]))*100),
      (ecdf(unlist(sim_no_km[,2]))(unlist(observed_no_km[1,2]))*100),
      (ecdf(unlist(sim_no_km[,3]))(unlist(observed_no_km[1,3]))*100),
      (ecdf(unlist(sim_no_km[,4]))(unlist(observed_no_km[1,4]))*100),
      (ecdf(unlist(sim_no_km[,5]))(unlist(observed_no_km[1,5]))*100)))
```
```{r ANN_no_period_table, eval=TRUE, results='asis', echo=FALSE}

stargazer(ANN_no_table, type = "html", 
  summary = FALSE, align=TRUE)

asis_output("\\br")
```

On the contrary, the simulated as well as observed nearest neighbor distances for non-expellers remain essentially steady across the time periods. Nothing observed is unusual compared to the simulations. These values generally show a left-side tail, which makes sense because the actual distribution is that some cities were in dense areas, but not the majority of them.

All in all, expelling cities are not particularly close, though they may get closer through time. Non-expelling cities are closer to non-expelling cities.

## Trying something a little different: Moran's I with polygons

```{r make_polys, eval=TRUE, echo=FALSE}
# first, make the data into a spatial points dataframe
model_data_poolall_sp <- model_data_poolall
coordinates(model_data_poolall_sp) <- ~ XCOORD + YCOORD
crs_EPSG3035 <- "+proj=laea +lat_0=52 +lon_0=10 +x_0=4321000 +y_0=3210000 +ellps=GRS80 +towgs84=0,0,0,0,0,0,0 +units=m +no_defs"
projection(model_data_poolall_sp) <- crs_EPSG3035

# then, turn the cities points into voronoi/dirichlet/thiessen polygons
# code from M. Gimond: https://mgimond.github.io/Spatial/interpolation-in-r.html#thiessen-polygons

# Create a tessellated surface
model_data_poolall_polys <- as(spatstat::dirichlet(model_data_poolall_ppp), "SpatialPolygons")

# A ppp does not have projection information?
# requiring that this information be added manually
proj4string(model_data_poolall_polys) <- crs_EPSG3035

# The tessellated surface does not store attribute information
# from the point data layer. We'll use the over() function (from the sp
# package) to join the point attributes to the tesselated surface via
# a spatial join. The over() function creates a dataframe that will need to
# be added to the `model_data_poolall_polys` object thus creating a SpatialPolygonsDataFrame object
# for some reason one of the Expl values is becoming NA here
model_data_poolall_polys.z <- over(model_data_poolall_polys, model_data_poolall_sp)
model_data_poolall_polys.spdf <-  SpatialPolygonsDataFrame(model_data_poolall_polys, model_data_poolall_polys.z)

# Move on to calculating Moran's I: first, identifying neighbors
# Still relying on M. Gimond

p_load(spdep)
nb <- poly2nb(model_data_poolall_polys.spdf, queen=FALSE) # not the queen case; neighbors only if share and edge, not a vertex
# assign weights to neighbors; equal weights style="W" (try "B"?)
lw <- nb2listw(nb, style="W", zero.policy=TRUE)
# compute the spatial lag of expulsions
# NOTE: this is problematic because it doesn't include time; an expulsion in N1 could be "lagged" but occur after an expulsion in the city...
Expl.lag <- lag.listw(lw, model_data_poolall_polys.spdf$Expl)

# now, actually do the computation, Monte Carlo style
MI.cont<- moran.mc(model_data_poolall_polys.spdf$Expl,lw, nsim=500)
MI.cont
plot(MI.cont, main="", las=1)

# What about using a distance band, instead of contiguous neighbors?
coo <- coordinates(model_data_poolall_polys.spdf) # of Thiessn poly centroids
model_data_poolall_polys.dist  <-  dnearneigh(coo, 0, 60000) # w/in 60km
# identify the neighbors
lw.band <- nb2listw(model_data_poolall_polys.dist, style="W", zero.policy=T) 
# calculate!
MI.band  <-  moran.mc(model_data_poolall_polys.spdf$Expl, lw.band, nsim=500, zero.policy=T) 
MI.band
plot(MI.band, main="", las=1)
```

  Basically, the distribution of expulsion is more clustered than expected, both by using Thiessen polygon contiguous neighbors and by using a distance band of 60km between Thiessen polygon centroids. The p-values <0.01 demonstrate this; the Moran's I statistics are all at the extreme positive for the simulated values. However, expulsions are not very clustered. The Moran's I statistics are tiny, <0.1. These results are probably related to the fact that there are very few expulsions.


### Try again, but split by time

```{r make_polys_periods_setup, eval=TRUE, echo=FALSE}
# first, make the data into a spatial points dataframe
model_data_period_1 <- model_data_nested$nested[[1]]
model_data_period_1_sp <- model_data_period_1
coordinates(model_data_period_1_sp) <- ~ XCOORD + YCOORD
projection(model_data_period_1_sp) <- crs_EPSG3035
# make the ppp outside the tesselation command, to set the window
model_data_period_1_ppp <- ppp(x=model_data_period_1$XCOORD, 
                              y=model_data_period_1$YCOORD,
                              xrange=c(min(model_data_period_1$XCOORD)-500, 
                                       max(model_data_period_1$XCOORD)+500),
                              yrange=c(min(model_data_period_1$YCOORD)-500, 
                                       max(model_data_period_1$YCOORD)+500),
                              marks=model_data_period_1[,c(1:2,5:1004)])
model_data_period_1_polys <- as(spatstat::dirichlet(model_data_period_1_ppp), "SpatialPolygons")
proj4string(model_data_period_1_polys) <- crs_EPSG3035
model_data_period_1_polys.z <- over(model_data_period_1_polys, model_data_period_1_sp)
model_data_period_1_polys.spdf <-  SpatialPolygonsDataFrame(model_data_period_1_polys, model_data_period_1_polys.z)
# Period 2
model_data_period_2 <- model_data_nested$nested[[2]]
model_data_period_2_sp <- model_data_period_2
coordinates(model_data_period_2_sp) <- ~ XCOORD + YCOORD
projection(model_data_period_2_sp) <- crs_EPSG3035
# make the ppp outside the tesselation command, to set the window
model_data_period_2_ppp <- ppp(x=model_data_period_2$XCOORD, 
                              y=model_data_period_2$YCOORD,
                              xrange=c(min(model_data_period_2$XCOORD)-500, 
                                       max(model_data_period_2$XCOORD)+500),
                              yrange=c(min(model_data_period_2$YCOORD)-500, 
                                       max(model_data_period_2$YCOORD)+500),
                              marks=model_data_period_2[,c(1:2,5:1004)])
model_data_period_2_polys <- as(spatstat::dirichlet(model_data_period_2_ppp), "SpatialPolygons")
proj4string(model_data_period_2_polys) <- crs_EPSG3035
model_data_period_2_polys.z <- over(model_data_period_2_polys, model_data_period_2_sp)
model_data_period_2_polys.spdf <-  SpatialPolygonsDataFrame(model_data_period_2_polys, model_data_period_2_polys.z)
#Period 3
model_data_period_3 <- model_data_nested$nested[[3]]
model_data_period_3_sp <- model_data_period_3
coordinates(model_data_period_3_sp) <- ~ XCOORD + YCOORD
projection(model_data_period_3_sp) <- crs_EPSG3035
# make the ppp outside the tesselation command, to set the window
model_data_period_3_ppp <- ppp(x=model_data_period_3$XCOORD, 
                              y=model_data_period_3$YCOORD,
                              xrange=c(min(model_data_period_3$XCOORD)-500, 
                                       max(model_data_period_3$XCOORD)+500),
                              yrange=c(min(model_data_period_3$YCOORD)-500, 
                                       max(model_data_period_3$YCOORD)+500),
                              marks=model_data_period_3[,c(1:2,5:1004)])
model_data_period_3_polys <- as(spatstat::dirichlet(model_data_period_3_ppp), "SpatialPolygons")
proj4string(model_data_period_3_polys) <- crs_EPSG3035
model_data_period_3_polys.z <- over(model_data_period_3_polys, model_data_period_3_sp)
model_data_period_3_polys.spdf <-  SpatialPolygonsDataFrame(model_data_period_3_polys, model_data_period_3_polys.z)
#Period 4
model_data_period_4 <- model_data_nested$nested[[4]]
model_data_period_4_sp <- model_data_period_4
coordinates(model_data_period_4_sp) <- ~ XCOORD + YCOORD
projection(model_data_period_4_sp) <- crs_EPSG3035
# make the ppp outside the tesselation command, to set the window
model_data_period_4_ppp <- ppp(x=model_data_period_4$XCOORD, 
                              y=model_data_period_4$YCOORD,
                              xrange=c(min(model_data_period_4$XCOORD)-500, 
                                       max(model_data_period_4$XCOORD)+500),
                              yrange=c(min(model_data_period_4$YCOORD)-500, 
                                       max(model_data_period_4$YCOORD)+500),
                              marks=model_data_period_4[,c(1:2,5:1004)])
model_data_period_4_polys <- as(spatstat::dirichlet(model_data_period_4_ppp), "SpatialPolygons")
proj4string(model_data_period_4_polys) <- crs_EPSG3035
model_data_period_4_polys.z <- over(model_data_period_4_polys, model_data_period_4_sp)
model_data_period_4_polys.spdf <-  SpatialPolygonsDataFrame(model_data_period_4_polys, model_data_period_4_polys.z)
#Period 5
model_data_period_5 <- model_data_nested$nested[[5]]
model_data_period_5_sp <- model_data_period_5
coordinates(model_data_period_5_sp) <- ~ XCOORD + YCOORD
projection(model_data_period_5_sp) <- crs_EPSG3035
# make the ppp outside the tesselation command, to set the window
model_data_period_5_ppp <- ppp(x=model_data_period_5$XCOORD, 
                              y=model_data_period_5$YCOORD,
                              xrange=c(min(model_data_period_5$XCOORD)-500, 
                                       max(model_data_period_5$XCOORD)+500),
                              yrange=c(min(model_data_period_5$YCOORD)-500, 
                                       max(model_data_period_5$YCOORD)+500),
                              marks=model_data_period_5[,c(1:2,5:1004)])
model_data_period_5_polys <- as(spatstat::dirichlet(model_data_period_5_ppp), "SpatialPolygons")
proj4string(model_data_period_5_polys) <- crs_EPSG3035
model_data_period_5_polys.z <- over(model_data_period_5_polys, model_data_period_5_sp)
model_data_period_5_polys.spdf <-  SpatialPolygonsDataFrame(model_data_period_5_polys, model_data_period_5_polys.z)
```
```{r moran_polys_periods, eval=TRUE, echo=FALSE}
# Move on to calculating Moran's I: first, identifying neighbors
# Still relying on M. Gimond
p1.nb <- poly2nb(model_data_period_1_polys.spdf, queen=FALSE) # not the queen case; neighbors only if share and edge, not a vertex
# assign weights to neighbors; equal weights style="W" (try "B"?)
p1.lw <- nb2listw(p1.nb, style="W", zero.policy=TRUE)
# now, actually do the computation, Monte Carlo style
MI_p1.cont<- moran.mc(model_data_period_1_polys.spdf$Expl,p1.lw, nsim=500)

p2.nb <- poly2nb(model_data_period_2_polys.spdf, queen=FALSE) # not the queen case; neighbors only if share and edge, not a vertex
# assign weights to neighbors; equal weights style="W" (try "B"?)
p2.lw <- nb2listw(p2.nb, style="W", zero.policy=TRUE)
# now, actually do the computation, Monte Carlo style
MI_p2.cont<- moran.mc(model_data_period_2_polys.spdf$Expl,p2.lw, nsim=500)

p3.nb <- poly2nb(model_data_period_3_polys.spdf, queen=FALSE) # not the queen case; neighbors only if share and edge, not a vertex
# assign weights to neighbors; equal weights style="W" (try "B"?)
p3.lw <- nb2listw(p3.nb, style="W", zero.policy=TRUE)
# now, actually do the computation, Monte Carlo style
MI_p3.cont<- moran.mc(model_data_period_3_polys.spdf$Expl,p3.lw, nsim=500)

p4.nb <- poly2nb(model_data_period_4_polys.spdf, queen=FALSE) # not the queen case; neighbors only if share and edge, not a vertex
# assign weights to neighbors; equal weights style="W" (try "B"?)
p4.lw <- nb2listw(p4.nb, style="W", zero.policy=TRUE)
# now, actually do the computation, Monte Carlo style
MI_p4.cont<- moran.mc(model_data_period_4_polys.spdf$Expl,p4.lw, nsim=500)

p5.nb <- poly2nb(model_data_period_5_polys.spdf, queen=FALSE) # not the queen case; neighbors only if share and edge, not a vertex
# assign weights to neighbors; equal weights style="W" (try "B"?)
p5.lw <- nb2listw(p5.nb, style="W", zero.policy=TRUE)
# now, actually do the computation, Monte Carlo style
MI_p5.cont<- moran.mc(model_data_period_5_polys.spdf$Expl,p5.lw, nsim=500)

MI_p1.cont
MI_p2.cont
MI_p3.cont
MI_p4.cont
MI_p5.cont
#plot(MI_p1.cont, main="", las=1)

```

  Looking by periods, I do not find any spatial autocorrelation. All Moran's *I* statistics are effectively 0 (<0.01), and the associated *p* values show that the observed values are not different than the simulated ones (*p*>0.1).

```{r moran_polys_periods_bands, eval=TRUE, echo=FALSE}
# What about using a distance band, instead of contiguous neighbors?
p1.coo <- coordinates(model_data_period_1_polys.spdf) # of Thiessn poly centroids
model_data_period_1_polys.dist  <-  dnearneigh(p1.coo, 0, 60000) # w/in 60km
# identify the neighbors
p1.lw.band <- nb2listw(model_data_period_1_polys.dist, style="W", zero.policy=T) 
# calculate!
MI_p1.band  <-  moran.mc(model_data_period_1_polys.spdf$Expl, p1.lw.band, nsim=500, zero.policy=T) 
MI_p1.band

p2.coo <- coordinates(model_data_period_2_polys.spdf) # of Thiessn poly centroids
model_data_period_2_polys.dist  <-  dnearneigh(p2.coo, 0, 60000) # w/in 60km
# identify the neighbors
p2.lw.band <- nb2listw(model_data_period_2_polys.dist, style="W", zero.policy=T) 
# calculate!
MI_p2.band  <-  moran.mc(model_data_period_2_polys.spdf$Expl, p2.lw.band, nsim=500, zero.policy=T) 
MI_p2.band

p3.coo <- coordinates(model_data_period_3_polys.spdf) # of Thiessn poly centroids
model_data_period_3_polys.dist  <-  dnearneigh(p3.coo, 0, 60000) # w/in 60km
# identify the neighbors
p3.lw.band <- nb2listw(model_data_period_3_polys.dist, style="W", zero.policy=T) 
# calculate!
MI_p3.band  <-  moran.mc(model_data_period_3_polys.spdf$Expl, p3.lw.band, nsim=500, zero.policy=T) 
MI_p3.band

p4.coo <- coordinates(model_data_period_4_polys.spdf) # of Thiessn poly centroids
model_data_period_4_polys.dist  <-  dnearneigh(p4.coo, 0, 60000) # w/in 60km
# identify the neighbors
p4.lw.band <- nb2listw(model_data_period_4_polys.dist, style="W", zero.policy=T) 
# calculate!
MI_p4.band  <-  moran.mc(model_data_period_4_polys.spdf$Expl, p4.lw.band, nsim=500, zero.policy=T) 
MI_p4.band

p5.coo <- coordinates(model_data_period_5_polys.spdf) # of Thiessn poly centroids
model_data_period_5_polys.dist  <-  dnearneigh(p5.coo, 0, 60000) # w/in 60km
# identify the neighbors
p5.lw.band <- nb2listw(model_data_period_5_polys.dist, style="W", zero.policy=T) 
# calculate!
MI_p5.band  <-  moran.mc(model_data_period_5_polys.spdf$Expl, p5.lw.band, nsim=500, zero.policy=T) 
MI_p5.band

#plot(MI_p1.band, main="", las=1)
```

## Someday look at the distribution of IVs...

```{r spatial_distr_IVs, eval=FALSE, echo=FALSE}
plot.Schoeffen <- ggplot(data=model_data, aes(x=XCOORD, y=YCOORD)) + 
  geom_point(aes(color=factor(Schoeffen)), alpha=0.7, shape = 1) +
  scale_color_manual(values=c("gray","blue")) +
  facet_wrap(~ Period2) +
  mytheme

plot.Mint <- ggplot(data=model_data, aes(x=XCOORD, y=YCOORD)) + 
  geom_point(aes(color=factor(Mint)), alpha=0.7, shape = 1) +
  scale_color_manual(values=c("gray","blue")) +
  facet_wrap(~ Period2) +
  mytheme

plot.Mkt2 <- ggplot(data=model_data, aes(x=XCOORD, y=YCOORD)) + 
  geom_point(aes(color=factor(Mkt2)), shape = 1) +
  scale_color_brewer(palette="Blues") +
  facet_wrap(~ Period2) +
  mytheme

plot.RouteCount <- ggplot(data=model_data, aes(x=XCOORD, y=YCOORD)) + 
  geom_point(aes(color=factor(RouteCount)), shape = 1) +
  scale_color_brewer(palette="Blues") +
  facet_wrap(~ Period2) +
  mytheme
```
 
 
 
```{r spatial_autocorr_IVs, eval=FALSE, echo=FALSE}
# calculate spatial autocorrelation for ind. variables

# especially check the spatial correlation of/btwn offices


```

### R Session Info

```{r session_info}
sessionInfo()
```